MONet Community Science Meeting 2023
This RMarkdown document contains code and scripts for preliminary processing and data visualization of soil biogeochemistry data. These graphs were created for use in Session 3 (Biogeochemistry Data Tutorial) on Tuesday, November 7.
Load packages
library(tidyverse) # for general data wrangling and visualization
library(maptools); library(sf) # for maps
library(ggthemes) # for theme_map
library(ggcorrplot) # to plot correlation matrix
library(soilpalettes) # for color palettes
# Install `soilpalettes` using: install.packages("devtools"); devtools::install_github("kaizadp/soilpalettes")
theme_set(theme_bw(base_size = 12)) # set default ggplot theme
Import files
bgc_data = read_csv("Biogeochem/data/bgc_data.csv")
metadata = read_csv("Biogeochem/data/bgc_metadata.csv")
bgc_analyses = read_csv("Biogeochem/data/bgc_analyses.csv") %>% dplyr::select(column, abbreviated, analysis_type)
Process the data. We will use the data in two forms - longform and wideform.
data_long =
bgc_data %>%
pivot_longer(cols = -c(Site_Code, Location), names_to = "column") %>%
left_join(bgc_analyses) %>%
left_join(metadata %>% dplyr::select(Site_Code, Lat, Long, biome_name))
data_wide =
data_long %>%
dplyr::select(-c(column, analysis_type)) %>%
pivot_wider(names_from = "abbreviated", values_from = "value") %>%
mutate(Location = factor(Location, levels = c("TOP", "BTM")))
data_long %>%
ggplot(aes(x = abbreviated, y = value, color = Location))+
geom_jitter(width = 0.3)+
facet_wrap(~analysis_type, scales = "free")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The samples were collected from sites spread across five biomes
## Create summary of biome counts
biome_numbers =
data_long %>%
distinct(Site_Code, biome_name) %>%
group_by(biome_name) %>%
dplyr::summarise(n = n()) %>%
force()
biome_numbers %>% knitr::kable()
| biome_name | n |
|---|---|
| Deserts & Xeric Shrublands | 13 |
| Mediterranean Forests, Woodlands & Scrub | 1 |
| Temperate Broadleaf & Mixed Forests | 21 |
| Temperate Conifer Forests | 19 |
| Temperate Grasslands, Savannas & Shrublands | 11 |
| NA | 5 |
# biome map
make_map_biome = function(bgc_wide, VAR, TITLE = "", LEGEND = VAR){
## Set CRS
common_crs <- 4326
## Set map size and point size
point_size <- 2
map_width = 9
map_height = 6
# Set up map layers for plotting
## Make US states map cropped to contiguous region
us <- read_sf("Biogeochem/cb_2018_us_state_5m/cb_2018_us_state_5m.shp") %>%
st_transform(., crs = common_crs)
us_bbox <- c(xmin = -125, xmax = -60, ymin = 20, ymax = 50)
region <- st_crop(us, y = us_bbox)
# make a dataset merging metadata with site lat-longs
df_map <-
bgc_wide %>%
filter(!is.na(Lat) & !is.na(Long)) %>%
st_as_sf(., coords = c("Long", "Lat"), crs = common_crs)
# Make the base map with all sites
## base_plot <-
ggplot() +
geom_sf(data = region, fill = "white", color = "grey70") +
geom_sf(data = df_map, aes_string(fill = VAR), size = 7, shape = 21, color = "black", stroke = 1) +
labs(title = TITLE,
fill = LEGEND)+
ggthemes::theme_map() +
theme(legend.background = element_rect(fill = alpha("white", 0.0)),
legend.key = element_rect(fill = "transparent"),
legend.key.size = unit(1, "cm"),
legend.text = element_text(size = 12),
legend.title = element_text(size = 12, face = "bold", vjust = 0.7),
legend.position = "right",
plot.title = element_text(size = 14, face = "bold"),
plot.background = element_rect(color = "black", fill = NA, linewidth = 1)) +
# scale_color_viridis_c(option = "plasma") +
scale_fill_manual(values = soilpalettes::soil_palette("redox2", 5))+
NULL
}
make_map_biome(data_wide, VAR = "biome_name")
Use this function to set up the correlation plots for the entire
dataset.
We use the ggcorrplot package for this.
fit_correlations_function = function(dat, TITLE = ""){
num =
dat %>%
dplyr::select(where(is.numeric)) %>%
drop_na()
num_clean =
num %>%
rownames_to_column("row") %>%
pivot_longer(-row) %>%
separate(name, sep = "_", into = c("name")) %>%
pivot_wider() %>%
dplyr::select(-row)
m = cor(num_clean)
p.mat <- ggcorrplot::cor_pmat(num_clean)
ggcorrplot::ggcorrplot(m, type = "lower",
p.mat = p.mat,
outline.color = "black",
# lab = TRUE,
insig = "blank",
colors = c("#E46726", "white", "#6D9EC1"),
title = TITLE)
}
# this will generate a correlation matrix for the entire BGC dataset
corr_all = fit_correlations_function(data_wide %>% dplyr::select(-Lat, -Long, -GWC))
# However, there are a lot of variables here.
# So, for an easier plot, subset select variables and then re-run the correlation matrix.
data_wide_subset = data_wide %>%
dplyr::select(Ca, Mg, Na, K, Bases, CEC,
Sand, Silt, Clay, SO4S, NO3N, NH4N,
TC, TN, pH, Lat, Long, Location)
fit_correlations_function(data_wide_subset %>% filter(Location == "TOP"))
Individual Correlations
data_wide %>%
ggplot(aes(x = TC, y = TN))+
geom_point()+
geom_smooth(method = "lm", se = F)+
labs(x = "Total C (%)",
y = "Total N (%)")
data_wide %>%
ggplot(aes(x = Clay, y = Sand))+
geom_point()+
geom_smooth(method = "lm", se = F)+
labs(x = "Clay (%)",
y = "Sand (%)")
data_wide %>%
ggplot(aes(y = CEC, x = Clay))+
geom_point()+
geom_smooth(method = "lm", se = F)+
labs(y = "Cation Exchange Capacity (meq/100g)",
x = "Clay (%)")
data_wide %>%
ggplot(aes(x = Ca, y = Bases))+
geom_point()+
geom_smooth(method = "lm", se = F)+
labs(y = "Total bases (meq/100g)",
x = "Calcium (meq/100g)")
data_wide %>%
ggplot(aes(x = TC, y = WEOC))+
geom_point()+
geom_smooth(method = "lm", se = F)+
labs(x = "Total carbon (%)",
y = "Water-extractable organic carbon (mg/g)")
These are “jittered” scatter plots showing distribution of the
variables by depth or by biome.
Also included is code to calculate the analysis of variance (ANOVA)
TC ANOVA
lm((TC) ~ Location * biome_name, data = data_wide) %>% car::Anova()
## Anova Table (Type II tests)
##
## Response: (TC)
## Sum Sq Df F value Pr(>F)
## Location 151.92 1 24.2377 2.9e-06 ***
## biome_name 92.18 4 3.6768 0.007448 **
## Location:biome_name 61.48 4 2.4521 0.049940 *
## Residuals 714.55 114
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot TC by depth
data_wide %>%
ggplot(aes(x = Location, y = TC, color = Location))+
geom_boxplot(position = position_dodge(width = 0.5), width = 0.3, outlier.colour = NA)+
geom_jitter(size = 3, width = 0.3)
plot TC by biome
set.seed(1234)
data_wide %>%
ggplot(aes(x = biome_name, y = TC, color = Location))+
geom_boxplot(position = position_dodge(width = 0.5), width = 0.3)+
geom_point(size = 3,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.5))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
labs(x = "")
CEC ANOVA
lm((CEC) ~ Location * biome_name, data = data_wide) %>% car::Anova()
## Anova Table (Type II tests)
##
## Response: (CEC)
## Sum Sq Df F value Pr(>F)
## Location 100.0 1 0.7992 0.3732
## biome_name 3283.7 4 6.5619 8.661e-05 ***
## Location:biome_name 466.0 4 0.9311 0.4486
## Residuals 14261.9 114
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot CEC by depth
data_wide %>%
ggplot(aes(x = Location, y = CEC, color = Location))+
geom_boxplot(position = position_dodge(width = 0.5), width = 0.3, outlier.colour = NA)+
geom_jitter(size = 3, width = 0.3)+
labs(x = "")
plot CEC by biome
set.seed(1234)
data_wide %>%
ggplot(aes(x = biome_name, y = CEC, color = Location))+
#geom_jitter(size = 3, width = 0.2)+
geom_boxplot(position = position_dodge(width = 0.5), width = 0.3)+
geom_point(size = 3,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.5))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
labs(x = "")
Clay ANOVA
lm((Clay) ~ Location * biome_name, data = data_wide) %>% car::Anova()
## Anova Table (Type II tests)
##
## Response: (Clay)
## Sum Sq Df F value Pr(>F)
## Location 483.7 1 5.2195 0.02412 *
## biome_name 2574.6 4 6.9449 4.686e-05 ***
## Location:biome_name 88.4 4 0.2385 0.91607
## Residuals 10936.1 118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot Clay by depth
data_wide %>%
ggplot(aes(x = Location, y = Clay, color = Location))+
geom_boxplot(position = position_dodge(width = 0.5), width = 0.3, outlier.colour = NA)+
geom_jitter(size = 3, width = 0.3)+
labs(x = "")
plot Clay by biome
set.seed(1234)
data_wide %>%
ggplot(aes(x = biome_name, y = Clay, color = Location))+
geom_boxplot(position = position_dodge(width = 0.5), width = 0.3)+
geom_point(size = 3,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.5))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
labs(x = "")
WEOC ANOVA
lm((WEOC) ~ Location * biome_name, data = data_wide) %>% car::Anova()
## Anova Table (Type II tests)
##
## Response: (WEOC)
## Sum Sq Df F value Pr(>F)
## Location 0.16630 1 12.7321 0.0005222 ***
## biome_name 0.14170 4 2.7122 0.0333182 *
## Location:biome_name 0.05427 4 1.0387 0.3903703
## Residuals 1.52822 117
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot WEOC by depth
data_wide %>%
ggplot(aes(x = Location, y = WEOC, color = Location))+
geom_boxplot(position = position_dodge(width = 0.5), width = 0.3, outlier.colour = NA)+
geom_jitter(size = 3, width = 0.3)+
labs(x = "")
plot WEOC by biome
set.seed(1234)
data_wide %>%
ggplot(aes(x = biome_name, y = WEOC, color = Location))+
geom_boxplot(position = position_dodge(width = 0.5), width = 0.3)+
geom_point(size = 3,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.5))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
labs(x = "")
This section has code to plot variables in a spatial manner on
maps.
The easiest way to do this is to run the custom make_map
function first, which will pre-load the map and other parameters.
You then select the variable you want to plot below.
make_map = function(bgc_wide, VAR, TITLE = "", LEGEND = VAR){
## Set CRS
common_crs <- 4326
## Set map size and point size
point_size <- 2
map_width = 9
map_height = 6
# Set up map layers for plotting
## Make US states map cropped to contiguous region
us <- read_sf("Biogeochem/cb_2018_us_state_5m/cb_2018_us_state_5m.shp") %>%
st_transform(., crs = common_crs)
us_bbox <- c(xmin = -125, xmax = -60, ymin = 20, ymax = 50)
region <- st_crop(us, y = us_bbox)
# make a dataset merging metadata with site lat-longs
df_map <-
bgc_wide %>%
filter(!is.na(Lat) & !is.na(Long)) %>%
st_as_sf(., coords = c("Long", "Lat"), crs = common_crs)
# Make the base map with all sites
## base_plot <-
ggplot() +
geom_sf(data = region, fill = "white", color = "grey70") +
geom_sf(data = df_map, aes_string(fill = VAR), size = 7, shape = 21, color = "black", stroke = 1) +
labs(title = TITLE,
fill = LEGEND)+
ggthemes::theme_map() +
theme(legend.background = element_rect(fill = alpha("white", 0.0)),
legend.key = element_rect(fill = "transparent"),
legend.key.size = unit(1, "cm"),
legend.text = element_text(size = 12),
legend.title = element_text(size = 12, face = "bold", vjust = 0.7),
legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.background = element_rect(color = "black", fill = NA, linewidth = 1)) +
# scale_color_viridis_c(option = "plasma") +
scale_fill_gradientn(colors = soilpalettes::soil_palette("redox2", 5))
}
Now, make the maps
make_map(data_wide %>% filter(Location == "TOP"), VAR = "TC", LEGEND = "TC (%)")
make_map(data_wide %>% filter(Location == "TOP"), VAR = "WEOC", LEGEND = "WEOC (mg/g)")
make_map(data_wide %>% filter(Location == "TOP"), VAR = "MBC", LEGEND = "MBC (mg/g)")
make_map(data_wide %>% filter(Location == "TOP"), VAR = "TN", LEGEND = "TN (%)")
make_map(data_wide %>% filter(Location == "BTM"), VAR = "CEC", LEGEND = "CEC (meq/100g)")
make_map(data_wide %>% filter(Location == "TOP"), VAR = "Clay", LEGEND = "Clay (%)")